We will use Microsoft COCO (Common Objects in Context) data set to train our "Image Caption Retrieval Model". This data set consists of pretrained 10-crop VGG19 features (Neural codes) and its corresponding text caption.
from __future__ import print_function
import os
import sys
import numpy as np
import pandas as pd
from collections import OrderedDict
# Directory layout used throughout the notebook.
DATA_PATH = 'data'             # precomputed features and COCO annotation json files
IMAGE_DATA= 'val2014'          # raw validation images (used for visualization)
EMBEDDING_PATH = 'embeddings'  # pretrained word embeddings
MODEL_PATH = 'models'          # saved Keras models / weight checkpoints
You will need to create the directories above and place the provided data set in the 'data' directory.
# DO NOT CHANGE BELOW CODE
import collections

# Load the precomputed VGG features and raw byte captions. Each .npy file
# holds a 0-d object array wrapping a dict with keys 'caps' and 'ims';
# .item() unwraps it.
np_train_data = np.load(os.path.join(DATA_PATH, 'train_data.npy'))
np_val_data = np.load(os.path.join(DATA_PATH, 'val_data.npy'))

# FIX(review): the original looped `for i in range(len(...))` while
# re-assigning the same two keys on every iteration; a single assignment
# per key is equivalent and avoids repeated .item() unwrapping.
train_data = collections.OrderedDict()
train_data['caps'] = np_train_data.item()['caps']
train_data['ims'] = np_train_data.item()['ims']

val_data = collections.OrderedDict()
val_data['caps'] = np_val_data.item()['caps']
val_data['ims'] = np_val_data.item()['ims']
# DO NOT CHANGE BELOW CODE
# use them for your own additional preprocessing step
# to map precomputed features and location of raw images
import json
# COCO validation-set annotations: object instances and captions.
# Used later to map captions back to image file names for display.
with open(os.path.join(DATA_PATH,'instances_val2014.json')) as json_file:
    coco_instances_val = json.load(json_file)
with open(os.path.join(DATA_PATH,'captions_val2014.json')) as json_file:
    coco_caption_val = json.load(json_file)
# create your own function to map pairs of precomputed features and filepath of raw images
# this will be used later for visualization part
# simple approach: based on matched text caption (see json file)
# YOUR CODE HERE
#todo: mapping
######### HELPER FUNCTION
import re
#x_caption = prepare_caption(train_caption_ids, train_caps)
#string to string caption
def clean_cap(caption):
    """Lower-case a caption, collapsing every non-alphanumeric run to one space."""
    normalized = re.sub('[^0-9a-zA-Z]+', ' ', caption)
    return normalized.lower().strip()
def arrstrcap2arrintcap(arrcaption):
    """Encode a tokenized caption (list of word strings) as a fixed-length
    int vector of vocabulary indices, zero-padded to 50 entries.

    FIX(review): unknown words previously mapped to 0, which is the <pad>
    token; the vocabulary reserves index 1 for <unk>. Also truncate inputs
    longer than 50 tokens instead of raising IndexError.
    """
    arr = np.zeros(50, dtype=int)
    for pos, word in enumerate(arrcaption[:50]):
        arr[pos] = words_indices.get(word, 1)
    return arr
def imgid2cap(imgid):
    """Return every caption string annotated for the given COCO image id."""
    captions = [ann['caption'] for ann in coco_caption_val['annotations']
                if ann['image_id'] == imgid]
    if not captions:
        raise Exception('Caption not found!')
    return captions
def cap2imgid(cap):
    """Return the image id whose annotation caption matches `cap` exactly.

    If several annotations share the same caption text, the last match in
    annotation order wins (same as the original scan-without-break loop).
    """
    matches = [ann['image_id'] for ann in coco_caption_val['annotations']
               if ann['caption'] == cap]
    if not matches:
        raise Exception('Image not found!')
    return matches[-1]
def mapto(xtrainid):
    """Display the raw image matching training example `xtrainid`.

    Decodes the integer caption back to text, scans the annotations for the
    first one whose cleaned caption matches, then shows the image file for
    that annotation's image id.
    """
    guessed = true_caption(x_caption[xtrainid])
    matched_id = -1
    for ann in coco_caption_val['annotations']:
        cleaned = re.sub('[^0-9a-zA-Z]+', ' ', ann['caption']).lower().strip()
        if cleaned == guessed:
            matched_id = ann['image_id']
            break
    for meta in coco_instances_val['images']:
        if meta['id'] == matched_id:
            show_img(meta['file_name'])
#translate array of int to a sentence
#input: np array of caption (encoded in [int])
#output: string
def true_caption(cap):
    """Decode an int-encoded caption to text, dropping <pad>/<unk> tokens."""
    tokens = (indices_words[idx] for idx in cap)
    kept = [tok for tok in tokens if tok != '<pad>' and tok != '<unk>']
    return ' '.join(kept)
def find_original_caption(image_id):
    """Return all ground-truth captions annotated for `image_id`.

    FIX(review): the original wrote into a fixed-size array of 5 and raised
    IndexError for COCO images carrying more than 5 captions (some have 6
    or 7); it also padded with '' when fewer were found. A plain list of
    the actual captions handles both cases.
    """
    return [ann['caption'] for ann in coco_caption_val['annotations']
            if ann['image_id'] == image_id]
#true_caption(x_val_caption[10])
# DO NOT CHANGE BELOW CODE
def build_dictionary(text):
    """Build a word -> index vocabulary from an iterable of caption strings.

    Indices are assigned by descending corpus frequency, starting at 2;
    indices 0 and 1 are reserved for <pad> and <unk>.
    """
    wordcount = OrderedDict()
    for sentence in text:
        for token in sentence.split():
            wordcount[token] = wordcount.get(token, 0) + 1
    words = list(wordcount.keys())
    freqs = list(wordcount.values())
    # most frequent word first (tie ordering follows np.argsort, as before)
    sorted_idx = np.argsort(freqs)[::-1]
    worddict = OrderedDict()
    worddict['<pad>'] = 0
    worddict['<unk>'] = 1
    for rank, sidx in enumerate(sorted_idx):
        worddict[words[sidx]] = rank + 2  # 0: <pad>, 1: <unk>
    return worddict
# use the resulting vocabulary index as your look up dictionary
# to transform raw text into integer sequences
all_captions = []
all_captions = train_data['caps'] + val_data['caps']
# decode bytes to string format
caps = []
for w in all_captions:
    caps.append(w.decode())
# word -> index lookup built from train + val captions combined
words_indices = build_dictionary(caps)
print ('Dictionary size: ' + str(len(words_indices)))
# inverse mapping (index -> word), used when decoding model output back to text
indices_words = dict((v,k) for (k,v) in words_indices.items())
##add custom
#words_indices = dict((k,v) for (k,v) in words_indices.items())
from keras.layers import Dense, Embedding,Input,LSTM,GRU,Lambda,add,dot,subtract, maximum
from keras.models import Model
import keras.backend as K
# YOUR CODE HERE
from keras.layers import Dense, Embedding,Dot,Input,LSTM,GRU,Add, Subtract, concatenate
#image network
# project the 4096-d VGG feature into the 1024-d joint embedding space
img_input = Input(shape=(4096,),name='IMG_input')
condense_img = Dense(1024,name='Dense_IMG')(img_input)
import gensim
from gensim.models import KeyedVectors
path = ".."
#convert GloVe into word2vec format
#gensim.scripts.glove2word2vec.get_glove_info(path)
#gensim.scripts.glove2word2vec.glove2word2vec(path, "glove_converted.txt")
# 300-d GloVe vectors, pre-converted once to word2vec text format
glove = KeyedVectors.load_word2vec_format("../glove_converted.txt", binary=False)
# YOUR CODE HERE
voc_size = len(indices_words)#11k ish
cap_size = 50  # fixed maximum caption length, in tokens
caption_input = Input(shape=(cap_size,),name='CAP_input')
noise_input = Input(shape=(cap_size,),name='Noise_input')
# layer for computing dot product between tensors
vocab_dim = 300 # dimensionality of your word vectors
n_symbols = voc_size + 1 # adding 1 to account for 0th index (for masking)
# rows = vocabulary indices, cols = GloVe dims; words without a GloVe
# vector stay all-zero
embedding_weights = np.zeros((n_symbols, vocab_dim))
for word,index in words_indices.items():
    try:
        embedding_weights[index, :] = glove[word]
    except KeyError:
        embedding_weights[index, :] = np.zeros(vocab_dim)
# define inputs here
# frozen (non-trainable) embedding layer initialised with the GloVe matrix
embedding_layer = Embedding(output_dim=vocab_dim, input_dim=n_symbols, trainable=False)
embedding_layer.build((None,)) # if you don't do this, the next step won't work
embedding_layer.set_weights([embedding_weights])
# shared LSTM: the same weights encode both the real and the noise caption
recurrent_layer = LSTM(1024,name='recurrent_layer')
#inputs into shared layers
embed_caption = embedding_layer(caption_input)
embed_noise = embedding_layer(noise_input)
recurrent_noise = recurrent_layer(embed_noise)
recurrent_caption = recurrent_layer(embed_caption)
# YOUR CODE HERE
#noise and real score
# cosine similarity (normalized dot product) between image and caption codes
cap_image = dot([condense_img,recurrent_caption],1,normalize=True, name='DotProd_postive_score')
noise_image = dot([condense_img,recurrent_noise],1,normalize=True, name='DotProd_negative_score')
# output column 0 = positive-pair score, column 1 = negative-pair score
conc = concatenate([cap_image,noise_image],axis=-1)
# YOUR CODE HERE
# define your model input and output
print ("loading the training model")
training_model = Model(inputs=[img_input,caption_input,noise_input],outputs=conc)
training_model.summary()
# YOUR CODE HERE
# define your model input and output
print ("loading sub-models for retrieving Neural codes")
# sub-models sharing the trained layers, used later to extract Neural codes
caption_model = Model(inputs=caption_input, outputs=recurrent_caption)
caption_model.summary()
image_model = Model(inputs=img_input, outputs=condense_img)
image_model.summary()
We define our loss function as a loss for maximizing the margin between a positive and negative example. If we call $p_i$ the score of the positive pair of the $i$-th example, and $n_i$ the score of the negative pair of that example, the loss is:
\begin{equation*} loss = \sum_i{max(0, 1 -p_i + n_i)} \end{equation*}
from keras import backend as K
def max_margin_loss(y_true, y_pred):
    """Ranking hinge loss: sum_i max(0, 1 - p_i + n_i).

    y_pred[:, 0] holds the positive-pair score p_i and y_pred[:, 1] the
    negative-pair score n_i; y_true is a dummy label and is ignored.
    """
    # FIX(review): removed a leftover debug print of y_pred.shape
    return K.sum(K.maximum(0.0, 1.0 - y_pred[:, 0] + y_pred[:, 1]))
How many times did the positive pair effectively get a higher value than the negative pair?
# YOUR CODE HERE
def accuracy(y_true, y_pred):
    """Fraction of examples where the positive pair outscores the negative pair."""
    # FIX(review): K.mean on a raw boolean tensor fails (or truncates to int)
    # under the TensorFlow backend; cast the comparison to float first.
    correct = K.cast(K.greater(y_pred[:, 0], y_pred[:, 1]), K.floatx())
    return K.mean(correct)
# DO NOT CHANGE BELOW CODE
print ("compiling the training model")
training_model.compile(optimizer='adam', loss=max_margin_loss, metrics=[accuracy])
# NOTE(review): the two sub-models are never fit directly; compiling them
# presumably just silences Keras warnings when they are saved/loaded later.
image_model.compile(optimizer='adam', loss=max_margin_loss, metrics=[accuracy])
caption_model.compile(optimizer='adam', loss=max_margin_loss, metrics=[accuracy])
#training_model.compile(optimizer='adam', loss=max_margin_loss)
# sampling one caption per image
# return image_ids, caption_ids
#['caps']['ims']
# YOUR CODE HERE
def sampling_img_cap(data):
    """Shuffle image indices and sample one of the 5 captions per image.

    The caption array stores 5 consecutive captions per image, so caption
    index = image index * 5 + offset in [0, 5).
    Returns (image_ids, caption_ids).
    """
    n_images = len(data['ims'])
    image_ids = np.arange(n_images)
    np.random.shuffle(image_ids)
    caption_ids = [img * 5 + np.random.randint(0, 5) for img in image_ids]
    return image_ids, caption_ids
#train_image_ids, train_caption_ids = sampling_img_cap(train_data)
# transform raw text caption into integer sequences of fixed maximum length
def make_50(arr):
    """Pad/truncate an integer sequence to exactly 50 entries (zero padding)."""
    padded = np.zeros(50, dtype=int)
    length = min(50, len(arr))
    padded[:length] = arr[:length]
    return padded
def prepare_caption(caption_ids, caption_data):
    """Convert the sampled raw captions into 50-long integer sequences.

    caption_ids -- indices into caption_data (one sampled caption per image)
    caption_data -- list of caption strings
    Returns a (len(caption_ids), 50) int array.
    """
    # FIX(review): use .get with the <unk> index (1) so a word absent from
    # the vocabulary no longer raises KeyError; also dropped the unused
    # `zero` temporary from the original.
    selected = [caption_data[cid] for cid in caption_ids]
    caption_seqs = [[words_indices.get(word, 1) for word in sentence.split()]
                    for sentence in selected]
    return np.asarray([make_50(seq) for seq in caption_seqs])
#x_caption = prepare_caption(train_caption_ids, train_caps)
# DO NOT CHANGE BELOW CODE
# decode the raw byte captions to str once, for train and val
train_caps = []
for cap in train_data['caps']:
    train_caps.append(cap.decode())
val_caps = []
for cap in val_data['caps']:
    val_caps.append(cap.decode())
# DO NOT CHANGE BELOW CODE
# sample one caption per image, then build the model-ready arrays
train_image_ids, train_caption_ids = sampling_img_cap(train_data)
val_image_ids, val_caption_ids = sampling_img_cap(val_data)
x_caption = prepare_caption(train_caption_ids, train_caps)
x_image = train_data['ims'][np.array(train_image_ids)]
x_val_caption = prepare_caption(val_caption_ids, val_caps)
x_val_image = val_data['ims'][np.array(val_image_ids)]
Notice that we do not have real output labels for training the model. The Keras API expects labels, so we need to create a dummy output -- a numpy array of zeros. These dummy labels are never used, since the loss function is computed from the margin between positive examples (image-real caption) and negative examples (image-fake caption).
# YOUR CODE HERE
# Negative ("noise") captions: a row-shuffled copy of the real captions,
# so each image is paired with a random other image's caption.
train_noise = np.copy(x_caption)
val_noise = np.copy(x_val_caption)
np.random.shuffle(train_noise)
np.random.shuffle(val_noise)
# FIX(review): label sizes were hard-coded (10000 / 5000); derive them from
# the actual data so the code survives a different split size. The labels
# are dummies -- max_margin_loss ignores y_true entirely.
y_train_labels = np.zeros(len(x_image))
y_val_labels = np.zeros(len(x_val_image))
# YOUR CODE HERE
X_train = [x_image, x_caption, train_noise]
Y_train = y_train_labels
X_valid = [x_val_image, x_val_caption, val_noise]
Y_valid = y_val_labels
# 20 epochs, resampling the negative (noise) captions before each epoch
for i in range(20):
    np.random.shuffle(train_noise)
    np.random.shuffle(val_noise)
    X_train = [x_image,x_caption,train_noise]
    X_valid = [x_val_image,x_val_caption,val_noise]
    training_model.fit(X_train,Y_train, validation_data=(X_valid, Y_valid), batch_size=100, epochs=1)
# DO NOT CHANGE BELOW CODE
# Save model
training_model.save(os.path.join(MODEL_PATH,'20iter_image_caption_model.h5'))
# Save weight parameters
training_model.save_weights(os.path.join(MODEL_PATH, '20iter_weights_image_caption.hdf5'))
# Save model for encoding caption and image
caption_model.save(os.path.join(MODEL_PATH,'20iter_caption_model.h5'))
image_model.save(os.path.join(MODEL_PATH,'20iter_image_model.h5'))
# second round of 20 epochs (40 total), resampling negatives each epoch
for i in range(20):
    np.random.shuffle(train_noise)
    np.random.shuffle(val_noise)
    X_train = [x_image, x_caption, train_noise]
    X_valid = [x_val_image, x_val_caption, val_noise]
    training_model.fit(X_train, Y_train, validation_data=(X_valid, Y_valid), batch_size=100, epochs=1)
# DO NOT CHANGE BELOW CODE
# Save model
# FIX(review): this checkpoint was saved as '60iter_...' and was therefore
# clobbered by the real 60-iteration save later, losing the 40-iter model.
training_model.save(os.path.join(MODEL_PATH,'40iter_image_caption_model.h5'))
# Save weight parameters
training_model.save_weights(os.path.join(MODEL_PATH, '40iter_weights_image_caption.hdf5'))
# Save model for encoding caption and image
caption_model.save(os.path.join(MODEL_PATH,'40iter_caption_model.h5'))
image_model.save(os.path.join(MODEL_PATH,'40iter_image_model.h5'))
# third round of 20 epochs (60 total), resampling negatives each epoch
for i in range(20):
    np.random.shuffle(train_noise)
    np.random.shuffle(val_noise)
    X_train = [x_image,x_caption,train_noise]
    X_valid = [x_val_image,x_val_caption,val_noise]
    training_model.fit(X_train,Y_train, validation_data=(X_valid, Y_valid), batch_size=100, epochs=1)
# DO NOT CHANGE BELOW CODE
# Save model
training_model.save(os.path.join(MODEL_PATH,'60iter_image_caption_model.h5'))
# Save weight parameters
training_model.save_weights(os.path.join(MODEL_PATH, '60iter_weights_image_caption.hdf5'))
# Save model for encoding caption and image
caption_model.save(os.path.join(MODEL_PATH,'60iter_caption_model.h5'))
image_model.save(os.path.join(MODEL_PATH,'60iter_image_model.h5'))
# fourth round of 20 epochs (80 total), resampling negatives each epoch
for i in range(20):
    np.random.shuffle(train_noise)
    np.random.shuffle(val_noise)
    X_train = [x_image,x_caption,train_noise]
    X_valid = [x_val_image,x_val_caption,val_noise]
    training_model.fit(X_train,Y_train, validation_data=(X_valid, Y_valid), batch_size=100, epochs=1)
# DO NOT CHANGE BELOW CODE
# Save model
training_model.save(os.path.join(MODEL_PATH,'80iter_image_caption_model.h5'))
# Save weight parameters
training_model.save_weights(os.path.join(MODEL_PATH, '80iter_weights_image_caption.hdf5'))
# Save model for encoding caption and image
caption_model.save(os.path.join(MODEL_PATH,'80iter_caption_model.h5'))
image_model.save(os.path.join(MODEL_PATH,'80iter_image_model.h5'))
# final round of 20 epochs (100 total), resampling negatives each epoch
for i in range(20):
    np.random.shuffle(train_noise)
    np.random.shuffle(val_noise)
    X_train = [x_image,x_caption,train_noise]
    X_valid = [x_val_image,x_val_caption,val_noise]
    training_model.fit(X_train,Y_train, validation_data=(X_valid, Y_valid), batch_size=100, epochs=1)
# DO NOT CHANGE BELOW CODE
# Save model
training_model.save(os.path.join(MODEL_PATH,'100iter_image_caption_model.h5'))
# Save weight parameters
training_model.save_weights(os.path.join(MODEL_PATH, '100iter_weights_image_caption.hdf5'))
# Save model for encoding caption and image
caption_model.save(os.path.join(MODEL_PATH,'100iter_caption_model.h5'))
image_model.save(os.path.join(MODEL_PATH,'100iter_image_model.h5'))
training_model.summary()
from keras.models import load_model
# Reload the 100-iteration checkpoints.
# FIX(review): the custom `accuracy` metric must also be passed via
# custom_objects, otherwise Keras resolves the serialized name 'accuracy'
# to its built-in metric (or fails to load), silently changing the numbers
# reported after reload.
custom = {'max_margin_loss': max_margin_loss, 'accuracy': accuracy}
training_model=load_model(os.path.join(MODEL_PATH,'100iter_image_caption_model.h5'), custom_objects=custom)
# Load model for encoding caption and image
caption_model=load_model(os.path.join(MODEL_PATH,'100iter_caption_model.h5'), custom_objects=custom)
image_model=load_model(os.path.join(MODEL_PATH,'100iter_image_model.h5'), custom_objects=custom)
# YOUR CODE HERE
# Use caption_model and image_model to produce "Neural codes"
# for both image and caption from validation set
img_model_nc = Model(inputs=image_model.input, outputs=image_model.get_layer("Dense_IMG").output)
# the LSTM is shared between caption and noise inputs, so take output node 0
cap_model_nc = Model(inputs=caption_model.input, outputs=caption_model.get_layer("recurrent_layer").get_output_at(0))
#nc_img = img_model_nc.predict(np.append(x_image,x_val_image,axis=0))
# 1024-d Neural codes for train/val images and captions
nc_img = img_model_nc.predict(x_image)
nc_img_val = img_model_nc.predict(x_val_image)
nc_cap = cap_model_nc.predict(x_caption)
nc_cap_val = cap_model_nc.predict(x_val_caption)
# print the shapes to confirm all features are 1024-dimensional
print(nc_img.shape)
print(nc_img_val.shape)
print(nc_cap.shape)
print(nc_cap_val.shape)
import matplotlib.pyplot as plt
%matplotlib inline
from keras.preprocessing import image
def show_img(imagename):
    """Load a raw image from IMAGE_DATA and display it without axes."""
    path = os.path.join(IMAGE_DATA, imagename)
    loaded = image.load_img(path, target_size=(224, 224))
    plt.imshow(loaded)
    plt.axis("off")
    plt.show()
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input, decode_predictions
# ImageNet-pretrained VGG16; its fc2 layer (4096-d) encodes query images.
# NOTE(review): the dataset intro says the precomputed features are VGG19 —
# confirm the two feature spaces are actually compatible.
vgg_model = VGG16(weights='imagenet')
fc2_model = Model(inputs=vgg_model.input, outputs=vgg_model.get_layer("fc2").output)
def load_img_preprocess(img_path):
    """Load an image at 224x224; return it as PIL image, raw array, and
    VGG-preprocessed batch (keys "img", "array", "x")."""
    pil_img = image.load_img(img_path, target_size=(224, 224))
    arr = image.img_to_array(pil_img)
    batch = preprocess_input(np.expand_dims(arr, axis=0))
    return {"img": pil_img, "array": arr, "x": batch}
#elephant1 = load_img_preprocess('val2014/COCO_val2014_000000000073.jpg')
def show_image_predictions(img_obj):
    """Display the image and print VGG16's top-5 ImageNet class predictions."""
    plt.imshow(img_obj["img"])
    plt.show()
    raw_preds = vgg_model.predict(img_obj["x"])
    top5 = decode_predictions(raw_preds, top=5)[0]
    print("Predictions:")
    for _, label, prob in top5:
        print("{}, with probability: {}".format(label, prob))
    print("")
def include_features(img_obj):
    """Attach the VGG16 fc2 feature vector to a load_img_preprocess() dict."""
    img_obj["fc2"] = fc2_model.predict(img_obj["x"])
#include_features(elephant1)
def get_features(imgname):
    """Return the 4096-d VGG16 fc2 feature for an image file in IMAGE_DATA."""
    # FIX(review): build the path from the IMAGE_DATA constant (as show_img
    # does) instead of the hard-coded 'val2014/' prefix; also renamed the
    # misleading `elephant1` local.
    img_obj = load_img_preprocess(os.path.join(IMAGE_DATA, imgname))
    include_features(img_obj)
    return np.array(img_obj["fc2"])
#get_features('COCO_val2014_000000000073.jpg')
# YOUR CODE HERE
# choose one image_id from validation set
# use this id to get filepath of image
img_id = 33499
filepath_image = 'COCO_val2014_000000033499.jpg'
# display original caption
original_caption = find_original_caption(img_id)
print(original_caption)
# DO NOT CHANGE BELOW CODE
#show_img(filepath_image)
# display the query image inline (same as show_img, done manually here)
img = image.load_img(os.path.join(IMAGE_DATA,filepath_image), target_size=(224,224))
plt.imshow(img)
plt.axis("off")
plt.show()
# function to retrieve caption, given an image query
from sklearn.neighbors import NearestNeighbors
def samearr(a, b):
    """Return True when two sequences are element-wise equal.

    FIX(review): the original indexed b at a's positions, raising IndexError
    when b was shorter and silently ignoring extra elements when b was
    longer; a length check plus zip handles both cases.
    """
    if len(a) != len(b):
        return False
    return all(x == y for x, y in zip(a, b))
def get_caption(image_filename, n=10):
    """Image -> caption retrieval: show the query image, print its ground-truth
    captions, then print the n nearest training captions in Neural-code space.

    image_filename -- file name inside the val2014 image directory
    n -- number of nearest captions to retrieve
    Returns the (distances, indices) pair from NearestNeighbors.kneighbors.
    """
    # encode the query image with VGG16 fc2, then display it
    feature=get_features(image_filename)
    show_img(image_filename)
    # recover the COCO image id so its original captions can be printed
    img_id=-1
    for img in coco_instances_val['images']:
        if(img['file_name']==image_filename):
            img_id=img['id']
    if(img_id==-1):
        # NOTE(review): this returns (not raises) the exception object
        return Exception('Pic metadata (json file) not found!')
    orig_cap=imgid2cap(img_id)
    print('original caption:')
    print(orig_cap)
    idtrain=-1
    # project the 4096-d feature into the 1024-d joint embedding space
    rep1024=img_model_nc.predict(feature)
    # L2 nearest neighbours among the training-caption Neural codes
    neigh = NearestNeighbors(n_neighbors=n, p=2)
    neigh.fit(nc_cap)
    nn = neigh.kneighbors(rep1024)
    #print(nn[1])
    for i in range(n):
        print("guessed cap: {}, distance={}".format(true_caption(x_caption[nn[1][0][i]]),nn[0][0][i]))
    return nn
# YOUR CODE HERE
# retrieval demos: each call shows the query image, its ground-truth
# captions, and the nearest retrieved training captions
get_caption('COCO_val2014_000000504439.jpg')
# DO NOT CHANGE BELOW CODE
get_caption('COCO_val2014_000000510182.jpg')
get_caption('COCO_val2014_000000019292.jpg')
get_caption('COCO_val2014_000000033499.jpg')
# DO NOT CHANGE BELOW CODE
get_caption('COCO_val2014_000000052005.jpg')
# DO NOT CHANGE BELOW CODE
get_caption('COCO_val2014_000000064629.jpg')
# DO NOT CHANGE BELOW CODE
get_caption('COCO_val2014_000000490081.jpg')
# DO NOT CHANGE BELOW CODE
get_caption('COCO_val2014_000000495288.jpg')
# DO NOT CHANGE BELOW CODE
get_caption('COCO_val2014_000000497106.jpg')
# DO NOT CHANGE BELOW CODE
get_caption('COCO_val2014_000000505035.jpg')
# DO NOT CHANGE BELOW CODE
get_caption('COCO_val2014_000000511241.jpg')
Briefly discuss the result. Why or how it works, and why do you think it does not work at some point.
First, we build a main training model that has 3 inputs: a VGG neural-code representation of an image (4096 dimensions), an integer array representing the translated caption sequence, and an integer array representing the translated noise-caption sequence. We also create 2 additional models for image and caption that reuse layers from the training model. The loss used is the max-margin loss as defined in the notebook, and the accuracy is defined as the fraction of examples where the positive pair scores higher than the negative pair.
The noise caption is defined as a shuffled copy of the real captions. We train the model for 100 iterations; in each iteration we run 1 epoch and reshuffle the noise captions. For the retrieval model, we extract the neural codes from the image and caption models respectively. For the image-to-caption task, we read the image, compute its neural code with a VGG16 network pre-trained on ImageNet, and then obtain its joint-space representation from the image model. Lastly, KNN is used to find the closest captions to the image representation using the L2 distance.
The results are decent. Most of the captions retrieved are in the same realm as the original image. Meaning the caption retrieved is about food and the original caption was also food related. However the retrieved caption might not have the specific food correct. Sometimes for example in the girl making a mess of her food, the retrieved captions get correct that there is a girl but miss out on the food part.
The network may identify certain features of an image correctly and encode them, so captions related to those features are then likely to lie near the query. If an image is closely related to an image from the training set, the resulting captions are pretty accurate.
Sometimes an image has some feature which was not known by the network for example the highchair in the image of the child. It then identifies the tray as some sort of device. This device gets encoded into the vector and then caption of a girl with a device are returned. The fault here is in unseen features.
# given text query, display retrieved image, similarity score, and its original caption
def search_image(text_caption, n=10):
    """Text -> image retrieval: encode the query caption and display the n
    nearest training images with their L2 distances.

    text_caption -- free-text query string
    n -- number of nearest images to retrieve
    """
    encoded = np.zeros(50, dtype=int)
    for pos, word in enumerate(text_caption.split()):
        if pos >= 50:
            break
        # FIX(review): out-of-vocabulary words previously mapped to 0, which
        # is the <pad> index; the vocabulary reserves index 1 for <unk>.
        encoded[pos] = words_indices.get(word, 1)
    # encode the caption into the 1024-d joint space, then find the nearest
    # image Neural codes by L2 distance
    neigh = NearestNeighbors(n_neighbors=n, p=2)
    neigh.fit(nc_img)
    rep = cap_model_nc.predict(np.array([encoded]))
    nn = neigh.kneighbors(rep)
    for rank, train_idx in enumerate(nn[1][0]):
        print(mapto(train_idx))
        print('Distance = {}'.format(nn[0][0][rank]))
# YOUR CODE HERE
# YOUR CODE HERE
Consider using the following settings for the image retrieval task.
# text -> image retrieval demos: each call shows the nearest images
# Example of text query
# text = 'two giraffes standing near trees'
# YOUR QUERY-1
text1 = 'two giraffes standing near trees'
# DO NOT CHANGE BELOW CODE
search_image(text1)
# Example of text query
# text = 'two giraffes standing near trees'
# YOUR QUERY-1
text1 = 'a clock tower extends into the sky in a cit'
# DO NOT CHANGE BELOW CODE
search_image(text1,n=5)
# Example of text query
# text = 'two giraffes standing near trees'
# YOUR QUERY-1
text1 = 'a brown bear handing out of a car with sharp teeth'
# DO NOT CHANGE BELOW CODE
search_image(text1)
# YOUR QUERY-1
text1 = 'a man flying through the air while riding a snowboard'
# DO NOT CHANGE BELOW CODE
search_image(text1)
# YOUR QUERY-2
text2 = 'bicycle'
# DO NOT CHANGE BELOW CODE
search_image(text2)
# YOUR QUERY-2
text2 = 'a man with a tie on and a head band in celebration of a holiday'
# DO NOT CHANGE BELOW CODE
search_image(text2)
# YOUR QUERY-2
text2 = 'small bathroom'
# DO NOT CHANGE BELOW CODE
search_image(text2)
# YOUR QUERY-2
text2 = 'a man holding an american flag riding down the street on a horse'
# DO NOT CHANGE BELOW CODE
search_image(text2)
# YOUR QUERY-2
text2 = 'sandwich'
# DO NOT CHANGE BELOW CODE
search_image(text2)
# YOUR QUERY-2
text2 = 'a woman in a kitchen holding a carton'
# DO NOT CHANGE BELOW CODE
search_image(text2)
Briefly discuss the result. Why or how it works, and why do you think it does not work at some point.
The results for the image retrieval are relatively good. The images resulting from the queries are often closely related to the query. For example, when asking for giraffes, most of the images contain a giraffe, even though a few zebras and horses were sometimes retrieved. When detecting unique objects, such as differentiating between animals, our model can retrieve images more accurately. This is because the features are easily distinguishable (shape of legs, body, color, etc.).
But when identifying features within similar objects (for example, gender identification in a person), our model still cannot determine them correctly most of the time. For example, when the query 'a woman in a kitchen holding a carton' was asked, pictures of men were retrieved. Perhaps identifying gender is not as easy as distinguishing a giraffe from a horse, where the size and the color are clearly different, while for a person the cues are not as clearly distinguishable. Another example is that bathrooms are often retrieved together with kitchens, since both usually have a dominant white background and some shelves.
This performance can be improved by training the model for more iterations so that the training loss becomes as small as possible. When we tried the 40-iteration model, it returned training images for the 'bicycle' query that are very different in shape and size. But with the 60-iteration model, it returned bicycle and motorcycle images, which are closely related. We believe that with a few hundred more iterations, a better retrieval model can be achieved.